The following is a simple data science framework used to carefully understand our data and build models
I used the version published on Kaggle in early 2017 for this project, which covers terrorist attacks that occurred between 1970 and 2015. (This database was updated in July 2017 to include incidents from 2016.)
This data includes ~170K attacks of which Taliban was responsible for 6575 of the attacks (the highest)
#### Importing packages
#Data manipulation
import pandas as pd
import seaborn as sns
import numpy as np
#visualisations
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
plt.rcParams['figure.figsize'] = (15, 8)
matplotlib.style.use('ggplot')
from plotnine import *
#Models
import xlrd
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
#from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
#from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
#Other
import warnings
warnings.filterwarnings('ignore')
import itertools
#Read the data
events = pd.read_csv('data.csv', index_col='eventid')
events.head()
Below is a table that summarises all the variables in the dataset into different broad categories. We'll discuss more about each variable as we proceed with our analysis.
| Variable Categories | Variable Names | |
|---|---|---|
| GTD ID and Date | eventid, iyear, imonth, iday, extended | |
| Attack Information | summary, doubtterr, multiple, related | |
| Attack Location | country_txt, region_txt, provstate, latitude, longitude | |
| Attack type | attacktype1_txt, attacktype2_txt, attacktype3_txt, success, suicide | |
| Weapon Information | weaptype1_txt, weaptype2_txt, weaptype3_txt, weaptype4_txt | |
| Target/Victim Information | target1, targtype1_txt, natlty1_txt, target2, targtype2_txt, natlty2_txt, target3, targtype3_txt, natlty3_txt | |
| Perpetrator Information | gname, gname2, gname3, guncertain1, guncertain2, guncertain3, nperps, nperpcap, claimed, compclaim | |
| Casualties and deaths | nkill, nkillter, nwound, nwoundte, property, propextent, ishostkid, nhostkid | |
| Additional variables | INT_LOG, INT_IDEO, INT_MISC, INT_ANY, 'specificity', 'individual', 'dbsource' |
#Replace NA's of a few categorical variables with "Unknown"
def do():
events['provstate'].fillna('Unknown', inplace=True)
events['country'].fillna('Unknown', inplace=True)
events['gname'].fillna('Unknown', inplace=True)
events['corp1'].fillna('Unknown',inplace = True)
do()
# Remove unknown target variables
e2 = events[events['gname']!='Unknown']
print("Shape of the data after removing unknown gnames")
print e2.shape
# Doubter is a variable that tells you whether there is a doubt as to an incident being an act of terrorism
# 1 = "Yes there is a doubt"; 0 = "No there is no doubt"
# Subset data with doubtter=0
e2 = e2[e2['doubtterr']==0]
print("Shape of the data after removing doubtful terrorist attacks")
print e2.shape
#Removing columns that have a missing value > 45%
def remove_missing():
missing_freq = e2.isnull().sum()
missing_freq_df = pd.DataFrame({'column':missing_freq.index,'no_missing':missing_freq.values})
missing_freq_df = missing_freq_df.sort_values(by='no_missing',ascending = False)
missing_freq_df['per'] = (missing_freq_df['no_missing']/92044)*100
#print missing_freq_df[missing_freq_df.per<45].shape
missing_freq_df_45 = missing_freq_df[missing_freq_df.per<45]
missing_subset_list=list(missing_freq_df_45.column)
e3 = e2[missing_subset_list]
print "Function successful"
return e3
e3 = remove_missing()
print("Shape of the data after removing missing value columns")
print e3.shape
print e3.columns
#Remove other unwanted columns
def drop_columns():
#Remove text columns
text_drop = ['weapdetail', 'weapsubtype1_txt', 'targsubtype1_txt','weaptype1_txt','region_txt','country_txt',
'attacktype1_txt','targtype1_txt','target1','natlty1_txt','city','scite1','summary']
e3.drop(text_drop, inplace=True, axis=1)
#Remove other columns
other_drop = ['latitude', 'longitude', 'specificity', 'individual', 'dbsource','iday','corp1',
'guncertain1','INT_MISC','INT_LOG', 'INT_IDEO','nperpcap','nwoundus','claimed','nkillus','doubtterr']
e3.drop(other_drop,inplace=True,axis=1)
print "Function successful"
return e3
e3 = drop_columns()
print e3.columns
#Removing groups which have less than 500 attacks
def remove_lt_500():
groups = e3['gname'].value_counts().to_dict()
groupsdf = pd.DataFrame.from_dict(groups, orient='index')
groupsdf.reset_index(inplace=True)
groupsdf.columns = ['group', '#events']
groupsdf = groupsdf.sort_values(by=['#events'],ascending=False)
test = groupsdf[groupsdf['#events'] > 500]
group_list = list(test["group"])
e4 = e3[e3.gname.isin(group_list)]
return e4
e4 = remove_lt_500()
print len(e4.gname.unique())
def impute_missing():
#Numeric columns
#nkill 4331 missing values
#nwound 6890 missing values
e4['nkill'] = e4.groupby("gname").nkill.transform(lambda x:x.fillna(x.median()))
e4['nwound'] = e4.groupby("gname").nwound.transform(lambda x:x.fillna(x.median()))
e4['nwoundte'] = e4.groupby("gname").nwoundte.transform(lambda x:x.fillna(x.median()))
e4['nperps'] = e4['nperps'].replace(-99,np.nan)
e4['nperps'] = e4['nperps'].replace(-9,np.nan)
e4['nperps'] = e4.groupby("gname").nperps.transform(lambda x:x.fillna(x.median()))
e4['nkillter'] = e4.groupby("gname").nkillter.transform(lambda x:x.fillna(x.median()))
# Categorical variables
# ishostkid -9 = unknown/missing
e4['ishostkid'] = e4['ishostkid'].replace(-9,np.nan)
e4['ishostkid'] = e4.groupby("gname").ishostkid.transform(lambda x:x.fillna(x.mode()[0]))
e4['ransom'] = e4['ransom'].replace(-9,np.nan)
e4['ransom'] = e4.groupby("gname").ransom.transform(lambda x:x.fillna(x.mode()[0]))
# INT_variables: -9 = unknown/missing
e4['INT_ANY'] = e4['INT_ANY'].replace(-9,np.nan)
e4['INT_ANY'] = e4.groupby("gname").INT_ANY.transform(lambda x:x.fillna(x.mode()[0]))
# weapsubtype1 = 13 = unknown/missing
e4['weapsubtype1'] = e4['weapsubtype1'].replace(13,np.nan)
e4['weapsubtype1'] = e4.groupby("gname").weapsubtype1.transform(lambda x:x.fillna(x.mode()[0]))
# property = -9 = unknown/missing
e4['property'] = e4['property'].replace(-9,np.nan)
e4['property'] = e4.groupby("gname").property.transform(lambda x:x.fillna(x.mode()[0]))
# targsubtype1 has 1963 missingvalues
e4['targsubtype1'] = e4.groupby("gname").targsubtype1.transform(lambda x:x.fillna(x.mode()[0]))
# natlty1 has 329 missingvalues
e4['natlty1'] = e4.groupby("country").natlty1.transform(lambda x:x.fillna(x.mode()[0]))
# attacktype1 9 = unknown/missing
e4['attacktype1'] = e4['attacktype1'].replace(9,np.nan)
e4['attacktype1'] = e4.groupby("gname").attacktype1.transform(lambda x:x.fillna(x.mode()[0]))
#targtype1 20 = unknown/missing
e4['targtype1'] = e4['targtype1'].replace(13,np.nan)
e4['targtype1'] = e4.groupby("gname").targtype1.transform(lambda x:x.fillna(x.mode()[0]))
return e4
e4 = impute_missing()
| Variable | Description | Type |
|---|---|---|
| ransom | 1 = "Yes" The incident involved a demand of monetary ransom; 0 = "No" The incident did not involve a demand of monetaryransom; -9 = "Unknown" It is unknown if the incident involved a demand ofmonetary ransom. | Categorical |
| nwound | This field records the number of confirmed non-fatal injuries to both perpetrators and victims | Numeric |
| INT_ANY | 1 = "Yes" The attack was international on any of the dimensionsdescribed above (logistically, ideologically, miscellaneous);0 = "No" The attack was domestic on all of the dimensions describedabove (logistically, ideologically, miscellaneous);-9 = "Unknown" It is unknown if the attack was international or domestic; thevalue for one or more dimensions is unknown. | Categorical |
| provstate | This variable records the name (at the time of event) of the 1st order subnationaladministrative region in which the event occurs. | Text |
| multiple | In those cases where several attacks are connected, but where the various actions do not constitute a single incident (either the time of occurrence of incidents or their locations are discontinuous – see Single Incident Determination section above), then “Yes” is selected to denote that the particular attack was part of a “multiple” incident. 1 = "Yes" The attack is part of a multiple incident. 0 = "No" The attack is not part of a multiple incident | Categorical |
| gname | Attack group | Categorical |
| nwoundte | Number of Perpetrators Injured | Numeric |
| nkill | This field stores the number of total confirmed fatalities for the incident. The number includes all victims and attackers who died as a direct result of the incident | Numeric |
| iyear | This field contains the year in which the incident occurred. | Categorical |
| vicinity | 1 = "Yes" The incident occurred in the immediate vicinity of the city in question. 0 = "No" The incident occurred in the city itself. | Categorical |
| success | 1 = "Yes" The incident was successful. 0 = "No" The incident was not successful | Categorical |
| imonth | This field contains the month in which the incident occurred. | Categorical |
| nperps | Number of Perpetrators | |
| targsubtype1 | The target subtype variable captures the more specific target category and providesthe next level of designation for each target type. | Categorical |
| property | “Yes” appears if there is evidence of property damage from the incident.1 = "Yes" The incident resulted in property damage.0 = "No" The incident did not result in property damage.-9 = "Unknown" It is unknown if the incident resulted in propertydamage | Categorical |
| crit1 | The violent act must be aimed at attaining a political, economic, religious, orsocial goal. This criterion is not satisfied in those cases where the perpetrator(s)acted out of a pure profit motive or from an idiosyncratic personal motiveunconnected with broader societal change.1 = "Yes" The incident meets Criterion 1.0 = "No" The incident does not meet Criterion 1 or there is no indicationthat the incident meets Criterion 1. | Categorical |
| suicide | This variable is coded “Yes” in those cases where there is evidence that theperpetrator did not intend to escape from the attack alive.1 = "Yes" The incident was a suicide attack.0 = "No" There is no indication that the incident was a suicide attack. | Categorical |
| weaptype1 | Up to four weapon types are recorded for each incident. | Categorical |
| nkillter | Number of Perpetrator Fatalities | Numeric |
| natlty1 | Nationality of Target/Victim | Categorical |
| extended | 1 = "Yes" The duration of an incident extended more than 24 hours.0 = "No" The duration of an incident extended less than 24 hours. | Categorical |
| crit2 | To satisfy this criterion there must be evidence of an intention to coerce,intimidate, or convey some other message to a larger audience (or audiences)than the immediate victims. Such evidence can include (but is not limited to) thefollowing: pre- or post-attack statements by the perpetrator(s), past behavior bythe perpetrators, or the particular nature of the target/victim, weapon, or attacktype.1 = "Yes" The incident meets Criterion 2.0 = "No" The incident does not meet Criterion 2 or no indication. | Categorical |
| attacktype1 | This field captures the general method of attack and often reflects the broad class oftactics used | Categorical |
| weapsubtype1 | This field records a more specific value for most of the Weapon Types | Categorical |
| ishostkid | This field records whether or not the victims were taken hostage (i.e. held againsttheir will) or kidnapped (i.e. held against their will and taken to another location)during an incident.1 = "Yes" The victims were taken hostage or kidnapped.0 = "No" The victims were not taken hostage or kidnapped.Hostages or Kidnapping Victims 51GLOBAL TERRORISM DATABASE CODEBOOK ©UNIVERSITY OF MARYLAND JUNE 2017-9 = "Unknown" It is unknown if the victims were taken hostage orkidnapped. | Categorical |
| country | country or location where the incident occurred | Categorical |
| crit3 | The action is outside the context of legitimate warfare activities, insofar as ittargets non-combatants (i.e. the act must be outside the parameters permittedby international humanitarian law as reflected in the Additional Protocol to theGeneva Conventions of 12 August 1949 and elsewhere).1 = "Yes" The incident meets Criterion 3.0 = "No" The incident does not meet Criterion 3 | Categorical |
| targtype1 | The target/victim type field captures the general type of target/victim. | Categorical |
| region | This field identifies the region in which the incident occurred | Categorical |
Profiling Categorical variables
%matplotlib inline
def data_prof1(col):
s = e4.groupby(["gname",col]).size().reset_index(name='counts')
s[col] = s[col].astype('category')
t = "#Attacks by"+" "+col
p1 = (ggplot(s,aes(x='gname',y='counts',fill=col))+theme(figure_size=(11, 6), axis_text_x=element_text(rotation=90))+geom_col()+ labs(title=t))
print p1
list_feat = ["weaptype1","natlty1","property","region","success","targtype1",
"suicide","iyear","country","attacktype1","INT_ANY","ishostkid","ransom"]
for col in list_feat:
data_prof1(col)
Profiling categorical variables:
Analysis:
def data_prof_numeric(col1,col2):
s = e4.groupby(["gname",col1]).agg({col2:"sum","country":"count"}).reset_index()
s.columns = ["gname",col1,"# Attacks",col2]
t = "#Attacks"+" "+"grouped by"+" "+col1+""+"-"+""+"Heat mapping by"+" "+col2
p1 = (ggplot(s,aes(x='gname',y='# Attacks',fill=col2))+theme(figure_size=(11, 6), axis_text_x=element_text(rotation=90))+geom_col()+ facet_wrap((col1,))+labs(title=t))
print p1
list_feat1 = ["weaptype1","attacktype1","region"]
list_feat2 = ["nkill","nwound","nkillter","nwoundte","nperps"]
for col1 in list_feat1:
for col2 in list_feat2:
data_prof_numeric(col1,col2)
Profiling numeric variables:
Analysis:
Most of the data is concentrated to limited terrorist groups and attacktypes and specific regions. However, we will proceed with this knowledge.
e4.columns
Converting categorical variables to dummy variables
data = e4
cat_vars=['ransom','weapsubtype1','targsubtype1','natlty1','ishostkid','INT_ANY','iyear','property',
'extended','country','region','multiple','success','suicide','attacktype1','targtype1','imonth',
'weaptype1']
for var in cat_vars:
cat_list='var'+'_'+var
cat_list = pd.get_dummies(e4[var], prefix=var)
data1=data.join(cat_list)
data=data1
data_vars=data.columns.values.tolist()
to_keep=[i for i in data_vars if i not in cat_vars]
data_final=data[to_keep]
#Dropping following variables
to_drop1 = ['ransom', 'weapsubtype1', 'targsubtype1', 'natlty1', 'ishostkid',
'INT_ANY', 'iyear', 'property', 'extended', 'country', 'region',
'provstate', 'vicinity', 'crit1', 'crit2', 'crit3',
'multiple', 'success', 'suicide', 'attacktype1', 'targtype1', 'imonth', 'weaptype1']
data.drop(to_drop1, inplace=True, axis=1)
Before we divide our data into training and testing sets, we will use only 50% of our original data for model training and validation. This is only for the purpose of my convenience, to optimize the run time of certain algorithms.
We will use sklearn function to split the training data in two datasets; 80/20 split. This is important, so we don't overfit our model. Meaning, the algorithm is so specific to a given subset, it cannot accurately generalize another subset, from the same dataset. It's important our algorithm has not seen the subset we will use to test, so it doesn't "cheat" by memorizing the answers. We will use sklearn's train_test_split function
print data_subset.shape
# Training and testing
seed = 10
data_subset = data.sample(frac=0.5, replace=True)
x = data_subset.drop(['gname'], axis=1)
y = data_subset['gname']
#Train test split
seed = 7
x_train, x_test, y_train, y_test = train_test_split(x, y,
test_size=0.2,
random_state=42)
print data_subset.shape
Neural networks : Data Preparation
from sklearn.preprocessing import LabelBinarizer
#one-hot encoding
encoder = LabelBinarizer()
Y = encoder.fit_transform(y_train)
Y_test = encoder.fit_transform(y_test)
#Scaling features
X = StandardScaler().fit(x_train).transform(x_train)
X_test = StandardScaler().fit(x_test).transform(x_test)
print list(encoder.inverse_transform(Y_test[0:5]))
Neural networks : Model training using Keras
from keras.models import Sequential #Sequential Models
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD #Stochastic Gradient Descent Optimizer
def create_model(opt):
model = Sequential()
model.add(Dense(1024, input_dim=442))
model.add(Dropout(0.2))
model.add(Activation('sigmoid'))
model.add(Dense(512))
model.add(Dropout(0.3))
model.add(Activation('sigmoid'))
model.add(Dense(23))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',optimizer=opt,metrics=['accuracy'])
return model
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model1 = create_model(sgd)
model2 = create_model("rmsprop")
model3 = create_model("adadelta")
history1 = model1.fit(X, Y, validation_data=(X_test,Y_test), nb_epoch=50, batch_size=2000)
history2 = model2.fit(X, Y, validation_data=(X_test,Y_test), nb_epoch=50, batch_size=2000)
history3 = model3.fit(X, Y, validation_data=(X_test,Y_test), nb_epoch=50, batch_size=2000)
Neural networks: Model Validation
#--------Plotting No of iterations vs. Categorical Cross entropy of a single neural network with optimiser = rmsprop
def plotting_cross_entropy():
plt.plot(history2.history['loss'],'o-')
plt.xlabel("Number of iterations")
plt.ylabel("Categorical cross entropy")
plt.title("Train error vs. no of iterations")
#--------Finding accuracy of the nn classifiers on test data
def nn_evaluation_train():
print('The training accuracy of neural network sgd classifier is {:.2f}'.format(np.mean(history1.history['acc'])*100))
print('The training accuracy of neural network rmsprop classifier is {:.2f}'.format(np.mean(history2.history['acc'])*100))
print('The training accuracy of neural network adadelta classifier is {:.2f}'.format(np.mean(history3.history['acc'])*100))
def nn_evaluation_test():
print('The validation accuracy of neural network sgd classifier is {:.2f}'.format(np.mean(history1.history['val_acc'])*100))
print('The validation accuracy of neural network rmsprop classifier is {:.2f}'.format(np.mean(history2.history['val_acc'])*100))
print('The validation accuracy of neural network adadelta classifier is {:.2f}'.format(np.mean(history3.history['val_acc'])*100))
#print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
#scores = model1.evaluate(X_test, Y_test, verbose=0)
#-------Extracting the class of the target variable post the predict operation
def class_extraction():
y_prob_model1 = model1.predict_proba(X_test)
y_class_model1 = y_prob_model1.argmax(axis=-1)
y_prob_model2 = model2.predict_proba(X_test)
y_class_model2 = y_prob_model2.argmax(axis=-1)
y_prob_model3 = model3.predict_proba(X_test)
y_class_model3 = y_prob_model3.argmax(axis=-1)
class_extraction()
nn_evaluation_train()
nn_evaluation_test()
plotting_cross_entropy()
In this section, I have built a simple XGBoost model that provides us very good accuracy on the validation dataset.
from sklearn.preprocessing import LabelEncoder
#Encode the target variable
label_encoder = LabelEncoder()
label_encoder_train = label_encoder.fit(y_train)
Y_train = label_encoder_train.transform(y_train)
label_encoder_test = label_encoder.fit(y_test)
Y_test = label_encoder_test.transform(y_test)
#Scaling the features
X_train = StandardScaler().fit(x_train).transform(x_train)
X_test = StandardScaler().fit(x_test).transform(x_test)
#Training the model
xgbc = xgb.XGBClassifier(seed=42, n_estimators=50, early_stopping_rounds = 5, learning_rate=0.05)
xgbc.fit(X_train, Y_train)
#Evaluate on test data
def evaluate_xg():
preds_xgb = xgbc.predict(X_test)
print('Accuracy of Xgboost classifier is {}'.format(accuracy_score(y_tesY, preds_xgb)*100))
evaluate_xg()
In this section I have used randomforest models for training the data. This is another ensemble based model that can provide good results.
#Preparing training and testing data
factor = pd.factorize(data_subset['gname'])
data_subset.gname = factor[0]
definitions = factor[1]
x = data_subset.drop(['gname'], axis=1)
y = data_subset['gname']
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.1, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(x_train)
X_test = scaler.transform(x_test)
rfrst = RandomForestClassifier(n_jobs=4, n_estimators=50, max_features='sqrt', criterion = 'entropy', random_state=42)
rfrst.fit(X_train, y_train)
preds_rfrst = rfrst.predict(X_test)
print('Accuracy of Rf model on validation dataset is : {}'.format(accuracy_score(y_test, preds_rfrst)))
#knn classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3, p=2, metric='minkowski')
knn.fit(X_train, y_train)
print('The accuracy of the knn classifier is {:.2f} on training data'.format(knn.score(X_train, y_train)))
print('The accuracy of the knn classifier is {:.2f} on test data'.format(knn.score(X_test, y_test)))
Another boosting technique useful for multi-class predictions.
from sklearn.ensemble import AdaBoostClassifier
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
#Adaboost classifier
dt = DecisionTreeClassifier()
model = AdaBoostClassifier(n_estimators=100,base_estimator=dt, random_state = 4)
model_ada = model.fit(X_train, y_train)
pred_ada = model_ada.predict(X_test)
print('Accuracy of adaboost model on validation dataset is : {}'.format(accuracy_score(y_test, pred_ada)))
#results = model_selection.cross_val_score(model_ada,X_test,y_test)
Having too many irrelevant features can sometimes decrease the accuracy of the models. Feature selection provides the following benefits:
#Encode the target variable
x = data_subset.drop(['gname'], axis=1)
y = data_subset["gname"]
label_encoder = LabelEncoder()
label_encoder_train = label_encoder.fit(y_train)
Y_train = label_encoder_train.transform(y_train)
label_encoder_test = label_encoder.fit(y_test)
Y_test = label_encoder_test.transform(y_test)
#Scaling the features
X_train = StandardScaler().fit(x_train).transform(x_train)
X_test = StandardScaler().fit(x_test).transform(x_test)
#Feature Importance
model_feature_imp = ExtraTreesClassifier(n_estimators=250,random_state=0)
model_feature_imp.fit(X_train, Y_train)
list_feat = {}
for feature, importance in zip(x.columns, model_feature_imp.feature_importances_):
list_feat[feature] = importance
importances = pd.DataFrame.from_dict(list_feat, orient='index').rename(columns={0: 'Gini-importance'})
importances.sort_values(by='Gini-importance').plot(kind='bar', rot=90)
table = importances.sort_values(by='Gini-importance',ascending= False)
table.head(50)
Looks like the top features include region, nationality of incident place, weapon type, etc.
| Model | Accuracy |
|---|---|
| nn_sgd | 60% |
| nn_rmsprop | 92% |
| nn_adadelta | 87% |
| xgboost | 98% |
| randomforest | 99% |
| knn | 89% |
| adaboost | 99% |
Following are some other things that I haven't gotten to, but that are extremely important for improving the accuracy of our models.